/**
 ** Pipelined multiplier: Booth-encoded partial products summed by a Wallace tree
 **/

/*
 * wallace - five-stage pipelined 32x32 multiplier.
 *
 * Data path (one register stage per "Stage N" block below):
 *   1. register Y with a trailing 0 and the sign-extended multiples +X, -X, +2X, -2X
 *   2. pick one of 16 shifted partial products per 3-bit Booth group of Y
 *   3. re-register the partial products; the wallaceItem column array reduces them
 *      combinationally to a sum word and a carry word
 *   4. register the two 64-bit adder operands (and the adder carry-in)
 *   5. register the low 32 bits of the adder output
 *
 * Ports:
 *   clk         - all registers update on the rising edge.
 *   rst, ctrl   - NOTE(review): declared but never referenced in this module, so
 *                 `done` and the stage registers hold X until real data has
 *                 flushed through. Confirm whether a reset is expected here.
 *   data_ready  - valid flag for x/y; it reaches `done` five clock edges later.
 *   x, y        - 32-bit operands (x is sign-extended, y is Booth-recoded).
 *   save_no_in  - reservation-station number, forwarded unchanged to save_no_out.
 *   rd_rob_in   - forwarded unchanged to rd_rob_out.
 *   result      - bits [31:0] of the final adder output.
 *
 * wallaceItem and adder_64bits are defined outside this file; the comments about
 * them below describe how they are wired, not their internals.
 */
module wallace (
    input wire clk,
    input wire rst,
    input wire data_ready,        // flag: the operands to multiply are ready
    input wire [31:0] x,
    input wire [31:0] y,
    input wire [3:0] ctrl,
    input wire [4:0] save_no_in,  // reservation-station number
    input wire [4:0] rd_rob_in,
    output reg done,
    output reg [4:0] save_no_out,
    output reg [4:0] rd_rob_out,
    output reg [31:0] result
);
    // Loop variables for the procedural for-loops.
    integer i, k;

    // Booth operands: Y with one extra low bit, and the four multiples of X.
    reg [32:0] ext_y;
    reg [63:0] ext_x, ext_nx, ext_dx, ext_ndx;

    // Partial products and their "+1" correction bits (stage 2 and stage 3 copies).
    reg [15:0] plusOne, plusOne_2;
    reg [63:0] partial [0:15];
    reg [63:0] partial_2 [0:15];

    // Side-band signals that travel down the pipeline next to the data.
    reg [4:0] save_no_stage1, save_no_stage2, save_no_stage3, save_no_stage4;
    reg [4:0] rd_rob_stage1, rd_rob_stage2, rd_rob_stage3, rd_rob_stage4;
    reg done_stage1, done_stage2, done_stage3, done_stage4;


    // Stage 1: prepare the values needed to build the partial products.
    always @(posedge clk) begin
        // Append a 0 below Y's LSB so every Booth group can read three bits.
        // (The 33-bit destination widens y before the shift, so y[31] is kept.)
        ext_y <= y << 1;
        // Sign-extended X, -X, 2X and -2X; negatives are full two's complements.
        ext_x   <= {{32{x[31]}}, x};
        ext_nx  <= ~({{32{x[31]}}, x}) + 1;
        ext_dx  <= {{32{x[31]}}, x} << 1;
        ext_ndx <= ~({{32{x[31]}}, x} << 1) + 1;
        // Side-band signals.
        save_no_stage1 <= save_no_in;
        rd_rob_stage1  <= rd_rob_in;
        done_stage1    <= data_ready;
    end

    // Stage 2: select the partial products.
    always @(posedge clk) begin
        for (i = 0; i < 16; i = i + 1) begin
            // The "+1" correction bits are not used for now (the original author
            // notes that the alternative method always produced wrong results),
            // so every correction bit is forced to 0.
            plusOne[i] <= 1'b0;
            // Group i looks at ext_y[2i+2 : 2i] and is weighted by 2^(2i).
            case ({ext_y[(i+i)+2], ext_y[(i+i)+1], ext_y[i+i]})
                3'b000: partial[i] <= 64'd0;                //   0
                3'b001: partial[i] <= ext_x   << (i+i);     //  +X
                3'b010: partial[i] <= ext_x   << (i+i);     //  +X
                3'b011: partial[i] <= ext_dx  << (i+i);     // +2X
                3'b100: partial[i] <= ext_ndx << (i+i);     // -2X
                3'b101: partial[i] <= ext_nx  << (i+i);     //  -X
                3'b110: partial[i] <= ext_nx  << (i+i);     //  -X
                3'b111: partial[i] <= 64'd0;                //   0
                default: partial[i] <= 64'd0;               // X/Z selector in simulation
            endcase
        end

        // Side-band signals.
        save_no_stage2 <= save_no_stage1;
        rd_rob_stage2  <= rd_rob_stage1;
        done_stage2    <= done_stage1;
    end

    // Stage 3: register the inputs of the Wallace tree.
    always @(posedge clk) begin
        for (k = 0; k < 16; k = k + 1) begin
            partial_2[k] <= partial[k];
        end

        plusOne_2 <= plusOne;
        // Side-band signals.
        save_no_stage3 <= save_no_stage2;
        rd_rob_stage3  <= rd_rob_stage2;
        done_stage3    <= done_stage2;
    end

    // Wallace-tree wiring.
    //   carry[j]  : 14-bit carry vector entering column j (carry[0] comes from
    //               the correction bits; carry[64] is driven but never read)
    //   nowx[j]   : bit j of all 16 partial products
    wire [13:0] carry [0:64];
    wire [63:0] wallaceSum;
    wire [63:0] wallaceCarry;
    wire [15:0] nowx [0:63];
    assign carry[0] = plusOne_2[13:0];
    genvar j;
    // Instantiate 64 column items and chain their carries to form the tree.
    generate
        for (j=0; j<64; j=j+1) begin
            assign nowx[j] = {partial_2[0][j], partial_2[1][j], partial_2[2][j], partial_2[3][j], partial_2[4][j], partial_2[5][j], partial_2[6][j], partial_2[7][j], partial_2[8][j], partial_2[9][j], partial_2[10][j], partial_2[11][j], partial_2[12][j], partial_2[13][j], partial_2[14][j], partial_2[15][j]};
            wallaceItem item(.x(nowx[j]), .carry_pre(carry[j]), .sum(wallaceSum[j]), .carry(wallaceCarry[j]), .carry_next(carry[j+1]));
        end
    endgenerate

    // Operands of the final adder.
    reg [63:0] x_final;
    reg [63:0] y_final;
    reg cin_final;
    wire [63:0] z;
    // Stage 4: register the two words produced by the tree.
    always @(posedge clk) begin
        // The carry word is moved up one bit position; correction bit 14 fills bit 0.
        x_final <= {wallaceCarry[62:0], plusOne_2[14]};
        y_final <= wallaceSum;
        // Register the adder carry-in together with its operands. Wiring
        // plusOne_2[15] straight to the adder would pair x_final/y_final with
        // the correction bit of the operation one cycle behind them.
        cin_final <= plusOne_2[15];

        // Side-band signals.
        save_no_stage4 <= save_no_stage3;
        rd_rob_stage4  <= rd_rob_stage3;
        done_stage4    <= done_stage3;
    end

    // 64-bit adder instance (carry-lookahead per the original comment); its
    // cout and overflow outputs are left unconnected.
    adder_64bits add_final(.a(x_final), .b(y_final), .cin(cin_final), .result(z), .cout(), .over());

    // Stage 5: register the final result and the side-band outputs.
    always @(posedge clk) begin
        result      <= z[31:0];
        save_no_out <= save_no_stage4;
        rd_rob_out  <= rd_rob_stage4;
        done        <= done_stage4;
    end
endmodule